The goal of this project is to provide insight into the main features that affect the filtering rate at the port. Additionally, we aim to construct a predictive model that can give an accurate estimate of the filtering rate beforehand.
# General functions and models #
library(reticulate)     # run Python chunks from R
library(xgboost)        # gradient boosted trees
library(SHAPforxgboost) # SHAP values for xgboost models
library(glmnet)         # regularised linear models
library(caretEnsemble)  # ensemble modelling (library() errors on failure, unlike require())
# general visualisation #
library(ggplot2)        # visualisation
library(grid)           # visualisation
library(corrplot)       # correlation plot matrix
library(RColorBrewer)   # colours
library(DT)             # display data in html tables
library(here)           # project-relative file paths
# general data manipulation #
library(dplyr)          # data manipulation
library(data.table)     # data manipulation
library(tibble)         # data wrangling
library(caret)          # Classification And Regression Training
library(psych)          # skew()
library(VIM)            # visualising missing values
# specific visualisation #
# specific data manipulation #
library(janitor)        # clean headers
library(naniar)         # missing data
library(mice)           # imputation
library(broom)          # tidy model output
library(lubridate)      # date handling
# Date plus forecast
## Helper functions
# multiplot: draw several ggplot objects on one page using a grid layout.
#   ...      : individual ggplot objects
#   plotlist : optional list of additional plots
#   file     : unused (kept for backward compatibility of the signature)
#   cols     : number of columns when no layout matrix is supplied
#   layout   : matrix whose entries place plot i in the matching cell(s)
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  plots <- c(list(...), plotlist)
  n_plots <- length(plots)
  # Default layout: `cols` wide, filled column-wise, as many rows as needed.
  if (is.null(layout)) {
    layout <- matrix(seq_len(cols * ceiling(n_plots / cols)),
                     ncol = cols, nrow = ceiling(n_plots / cols))
  }
  if (n_plots == 1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in seq_len(n_plots)) {
      # Find which cell(s) of the layout matrix are assigned to plot i.
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
import os
import pandas as pd
import numpy as np
# Load the daily port data (columns B:U of the Excel sheet).
df_port = pd.read_excel('../input/port/Data Diaria 2018 2019.xlsx', usecols = 'B:U')
# NOTE(review): 'DÃa Operacional' looks mojibake-encoded (presumably 'Día Operacional');
# confirm against the actual column header in the source workbook.
df_port.rename(columns = {'DÃa Operacional': 'date'}, inplace=True)
# The date column is renamed and then dropped (date-indexed variants are commented out below).
df_port = df_port.drop(columns = ["date"])
# df_port['date'] = pd.to_datetime(df_port['date'])
# df_port = df_port.set_index('date')
# display(set(df_filtering.loc[~df_filtering["pH"].astype(str).str.isdigit(), "pH"].tolist()))
# 'S/O' is the sentinel string used for a missing measurement.
df_port = df_port.replace('S/O', np.nan) ## missing value string
# Decimal comma -> decimal point, applied to every string cell.
df_port = df_port.replace({',': '.'}, regex=True)
# Below-detection-limit readings of total organic carbon are treated as zero.
df_port['COT'] = df_port['COT'].replace('< 0.1', 0)
# Drop rows where every column is missing.
df_port = df_port.dropna(how = 'all')
# df_port = df_port.apply(pd.to_numeric)
# df_port.fillna(df_port.mean(), inplace = True)
# Persist the cleaned table for the R side of the analysis.
df_port.to_csv("data/df_port.csv", index = False)port <- as_tibble(fread("data/df_port.csv"))
# Standardise headers to snake_case, then give the measurement columns short
# names. The original Spanish meaning is noted next to each entry.
rename_map <- c(
  hum_pct       = "humedad_percent",          # moisture %
  x10_pct       = "x10_number_percent",
  ph            = "p_h",                      # pH
  tft_ca_pct    = "tft_ca_percent",
  tft_floc      = "tft_floculante_g_ton",     # flocculant in g/ton
  sol_filt_pct  = "percent_solidos_filtrado", # % filtered solids
  sol_recep_pct = "percent_solidos_recep",    # % received solids
  esf_corte     = "esfuerzo_de_corte",        # shear stress
  tasa_filt     = "tasa_kg_m2_hr",            # filtering rate (kg/m2/hr)
  Na_gpl        = "sodio_mg_l",               # sodium concentration
  Cu_S_gpl      = "cu_s_mg_l",                # copper sulphide concentration
  tft_visc      = "tft_viscosidad",           # tft viscosity
  insol         = "insoluble",                # insolubles
  temp          = "temperatura"               # temperature
)
port <- port %>%
  clean_names("snake") %>%
  rename(all_of(rename_map))
# Per-row count of missing values. rowSums(is.na(.)) is vectorised; the
# original apply(port, 1, ...) first coerces the data frame to a matrix.
port$na_count <- rowSums(is.na(port))
dim(port)
## [1] 582 20
## hum_pct tasa_filt x10_pct insol
## Min. : 7.950 Min. :410.0 Min. : 0.00 Min. : 0.000
## 1st Qu.: 8.460 1st Qu.:534.5 1st Qu.:21.72 1st Qu.: 6.550
## Median : 8.670 Median :567.7 Median :23.01 Median : 7.590
## Mean : 8.703 Mean :570.2 Mean :22.98 Mean : 7.733
## 3rd Qu.: 8.850 3rd Qu.:604.0 3rd Qu.:24.20 3rd Qu.: 8.850
## Max. :10.080 Max. :717.4 Max. :28.63 Max. :13.000
## NA's :2 NA's :2 NA's :1 NA's :1
## Na_gpl sulfato_mg_l cloruro_mg_l conduct
## Min. :157.0 Min. : 459 Min. : 57.0 Min. : 901
## 1st Qu.:487.5 1st Qu.:1534 1st Qu.:129.0 1st Qu.:2945
## Median :580.0 Median :1701 Median :154.0 Median :3490
## Mean :565.4 Mean :1714 Mean :159.6 Mean :3341
## 3rd Qu.:650.5 3rd Qu.:1880 3rd Qu.:187.0 3rd Qu.:3870
## Max. :863.0 Max. :2709 Max. :331.0 Max. :4830
## NA's :3 NA's :3 NA's :3 NA's :3
## ph cot ca_ppm tft_ca_pct
## Min. : 6.25 Min. : 0.00 Min. : 24.0 Min. :0.1000
## 1st Qu.: 9.50 1st Qu.:10.00 1st Qu.:206.0 1st Qu.:0.2000
## Median :10.01 Median :14.50 Median :272.0 Median :0.2600
## Mean :10.01 Mean :15.52 Mean :287.8 Mean :0.2631
## 3rd Qu.:10.65 3rd Qu.:19.50 3rd Qu.:338.0 3rd Qu.:0.3000
## Max. :12.03 Max. :62.00 Max. :720.0 Max. :0.7800
## NA's :3 NA's :34 NA's :32 NA's :161
## Cu_S_gpl tft_visc tft_floc sol_filt_pct
## Min. :0.0200 Min. : 2.585 Min. :0.581 Min. :57.20
## 1st Qu.:0.1100 1st Qu.: 5.440 1st Qu.:1.358 1st Qu.:61.55
## Median :0.1600 Median : 6.240 Median :1.741 Median :61.99
## Mean :0.1972 Mean : 6.329 Mean :1.951 Mean :62.15
## 3rd Qu.:0.2400 3rd Qu.: 7.040 3rd Qu.:2.260 3rd Qu.:62.79
## Max. :1.1700 Max. :12.185 Max. :8.678 Max. :65.03
## NA's :3 NA's :1 NA's :54 NA's :4
## sol_recep_pct temp esf_corte na_count
## Min. :53.67 Min. :17.90 Min. : 2.508 Min. : 0.000
## 1st Qu.:61.62 1st Qu.:22.30 1st Qu.: 5.643 1st Qu.: 0.000
## Median :62.07 Median :25.20 Median : 7.059 Median : 1.000
## Mean :61.97 Mean :25.41 Mean : 7.365 Mean : 1.179
## 3rd Qu.:62.42 3rd Qu.:28.60 3rd Qu.: 8.855 3rd Qu.: 2.000
## Max. :64.34 Max. :31.70 Max. :15.763 Max. :16.000
## NA's :2 NA's :61 NA's :313
# sapply(port, class)
# Columns still stored as character after the numeric clean-up; vapply pins
# the result type (sapply can silently change shape on edge cases).
categ_cols <- names(port)[vapply(port, is.character, logical(1))]
cat('There are', length(categ_cols), 'remaining columns with character values')
## There are 0 remaining columns with character values
Since categorical variables enter into statistical models differently than continuous variables, storing data as factors ensures that the modeling functions will treat such data correctly. The code performs the following tasks: rename variable names, change data type to factor, and order ordinal factors.
Descriptive statistics describe quantitatively the basic features of the data. These statistics will give us a head start by providing information about stuff like skewness, outliers (range) missing data points and (near) zero variance.
With this plot we can see that the variables that present the highest amount of missing data are: - esfuerzo de corte - tft_ca_ppt - temp - tft_floc - cot - ca_ppm
The variables mentioned above are represented graphically to search for any pattern. In this graph, the data which is missing is given a value 10% less than the minimum value of the available data. It is then represented in a scatter plot where the red color represents the missing data.
# One scatter plot per high-missingness predictor against the filtering rate;
# naniar::geom_miss_point() highlights missing observations.
miss_vars <- c("esf_corte", "tft_ca_pct", "temp", "cot", "ca_ppm", "tft_floc")
miss_plots <- lapply(miss_vars, function(v) {
  port %>%
    ggplot(aes(x = .data[[v]], y = tasa_filt)) +
    geom_miss_point()
})
# Arrange the six panels in a 3 x 2 grid, filled row by row.
layout <- matrix(1:6, nrow = 3, ncol = 2, byrow = TRUE)
multiplot(plotlist = miss_plots, layout = layout)
# Fig. 1
There is no insight in regards to any pattern correlated to the filter rate.
In this part, we aim to answer the question: in which variables observations are missing, and how many? Aggregation plots are a useful tool for answering these questions. The one-liner below is all you need.
##
## Variables sorted by number of missings:
## Variable Count
## esf_corte 0.537800687
## tft_ca_pct 0.276632302
## temp 0.104810997
## tft_floc 0.092783505
## cot 0.058419244
## ca_ppm 0.054982818
## sol_filt_pct 0.006872852
## Na_gpl 0.005154639
## sulfato_mg_l 0.005154639
## cloruro_mg_l 0.005154639
## conduct 0.005154639
## ph 0.005154639
## Cu_S_gpl 0.005154639
## hum_pct 0.003436426
## tasa_filt 0.003436426
## sol_recep_pct 0.003436426
## x10_pct 0.001718213
## insol 0.001718213
## tft_visc 0.001718213
## na_count 0.000000000
It’s difficult to give any insight in regards to missing data.
Finally, we decide to impute the data. The technique used is the Predictive Mean Matching (PMM) algorithm. Two data frames are generated, one with imputed data df, and the other one with only the rows that have no missing data df_port.
After engineering new features and before starting the modelling, we will visualise the relations between our parameters using a correlation matrix. For this, we need to change all the input features into a numerical format. The visualisation uses the corrplot function from the eponymous package. Corrplot gives us great flexibility in manipulating the style of our plot.
What we see below, are the colour-coded correlation coefficients for each combination of two features. In simplest terms: this shows whether two features are connected so that one changes with a predictable trend if you change the other. The closer this coefficient is to zero the weaker is the correlation. Both 1 and -1 are the ideal cases of perfect correlation and anti-correlation (dark blue and dark red in the plots below).
Here, we are of course interested if and how strongly our features correlate with the tasaDeFiltrado, the prediction of which is the ultimate goal of this challenge. But we also want to know whether our potential predictors are correlated among each other, so that we can reduce the collinearity in our data set and improve the robustness of our prediction:
We find:
Analyzing only the sampled data shows new observations, which shouldn't be expected. For instance, Na_gpl appears to be highly correlated to the filtering rate, whilst this was not observed in the imputed data set.
This is a tricky situation since we can't give good feedback on what's going on. Why does the operator take some samples whilst in others he decides to omit them? We ought to talk with the people in charge of the sampling and do further analysis before we can give an answer.
The following plot is useful to visualize the distribution of the data. We can see that Na_gpl has a high number of points below and above the 75th percentile.
## Loading required package: viridisLite
# Box plots of every variable in the long-format data df.m (variable/value
# columns, prepared elsewhere), with jittered raw points overlaid.
df.m %>%
  ggplot(aes(x = variable, y = value, fill = variable)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 20),
    axis.text = element_text(size = 16),
    axis.text.x = element_text(angle = 45)  # trailing comma removed: it parses as an empty argument
  ) +
  ggtitle("Diagrama de cajas") +
  xlab("")
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats
from scipy.cluster import hierarchy as hc
import matplotlib.pyplot as plt

# Cluster the features by Spearman rank correlation: strongly related
# features end up on neighbouring branches of the dendrogram.
feat_df = pd.read_csv("df.csv")
feat_df = feat_df.drop(columns = ["Unnamed: 0", "na_count"]).reset_index(drop = True)
rank_corr = np.round(scipy.stats.spearmanr(feat_df).correlation, 4)
# Turn similarity into a condensed distance vector for linkage().
dist_condensed = hc.distance.squareform(1 - rank_corr)
linkage_mat = hc.linkage(dist_condensed, method='average')
fig = plt.figure(figsize=(16, 10))
dendrogram = hc.dendrogram(linkage_mat, labels=feat_df.columns,
                           orientation='left', leaf_font_size=14)
plt.show()
## [1] 582 20
# Pre-Processing of DataSet i.e. train : test split
# NOTE(review): there is no set.seed() before createDataPartition, so the
# split is not reproducible between runs — confirm whether that is intended.
tr_te_i <- createDataPartition(df$target, p = 0.8, list = FALSE)
tr <- df[tr_te_i,]
te <- df[-tr_te_i,]
dim(tr)
## [1] 466 20
## [1] 116 20
# Row index of the training partition, reused after preprocessing below.
tri <- seq_len(nrow(tr))
# Check Normal distribution on target: log-transform so the QQ plot is
# closer to a straight line.
tr$target <- log(tr$target)
qqnorm(tr$target)
qqline(tr$target)
# Check Normal distribution on target (complete-cases data frame).
df_port$target <- log(df_port$target)
qqnorm(df_port$target)
qqline(df_port$target)
# vapply pins the result type, unlike sapply.
num_var <- which(vapply(tr_te, is.numeric, logical(1))) #index vector numeric variables
num_varnames <- names(num_var) #saving names vector for use later on
# The target is excluded from predictor preprocessing.
num_varnames <- num_varnames[!(num_varnames %in% c("target"))]
df_num <- tr_te[, names(tr_te) %in% num_varnames]
# log(x + 1)-transform heavily skewed columns (|skew| > 0.8) so the
# centring/scaling below behaves better.
for (i in seq_len(ncol(df_num))) {
  if (abs(skew(df_num[, i])) > 0.8) {
    df_num[, i] <- log(df_num[, i] + 1)
  }
}
# Centre and scale every numeric predictor.
pre_num <- preProcess(df_num, method = c("center", "scale"))
print(pre_num)
## Created from 582 samples and 18 variables
##
## Pre-processing:
## - centered (18)
## - ignored (0)
## - scaled (18)
# Unify both categorical and numerical.
train_test <- df_norm
# Recover the train/test split using the original partition index.
train <- train_test[tri, ]   # X_train
test <- train_test[-tri, ]   # X_test
y <- tr$target               # y_train (log-transformed target)
# y_train <- tr$target
# y_test <- te$target
# Persist the model matrices for use outside this session.
write.csv(train, file = "X_train.csv")
write.csv(test, file = "X_test.csv")
write.csv(y, file = "y_train.csv")
write.csv(te$target, file = "y_test.csv")
# 7-fold resampling shared by every model fitted through caretList, so
# their out-of-fold predictions are comparable for ensembling.
trControl <- trainControl(method = "cv",
                          number = 7,
                          savePredictions = "all",
                          index = createResample(tr$target, 7),
                          allowParallel = TRUE)
# Hyper-parameter grids for the candidate models.
xgbTreeGrid <- expand.grid(nrounds = 400,
                           max_depth = seq(2, 6, by = 1),
                           eta = 0.1,
                           gamma = 0,
                           colsample_bytree = 1.0,
                           subsample = 1.0,
                           min_child_weight = 4)
## notice the . before the parameter: required by caret's glmnet wrapper
glmnetGridElastic <- expand.grid(.alpha = 0.9, .lambda = 0.009)
glmnetGridLasso <- expand.grid(.alpha = 1, .lambda = seq(0.001, 0.1, by = 0.001))
glmnetGridRidge <- expand.grid(.alpha = 0, .lambda = seq(0.001, 0.1, by = 0.001))
# Fit all candidate models on identical resampling folds (trControl above)
# so their out-of-fold predictions can be ensembled later.
set.seed(333)
# NOTE(review): `<<-` assigns into the enclosing/global environment; plain
# `<-` would suffice at top level — confirm nothing relies on this.
modelList <<- caretList(
x = as.matrix(train),
y = tr$target,
trControl=trControl,
metric="RMSE",
tuneList=list(
## Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
xgbTree = caretModelSpec(method="xgbTree", tuneGrid = xgbTreeGrid, nthread = 8),
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridElastic), ## Elastic, highly correlated with lasso and ridge regressions
#
# glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridLasso), ## Lasso
# The duplicated `glmnet` name is disambiguated to `glmnet.1` (see output below).
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridRidge) ## Ridge
#svmLinear3= caretModelSpec(method="svmLinear3", tuneLenght = 20) ## SVM
)
)## xgbTree glmnet glmnet.1
## xgbTree 1.0000000 0.8155238 0.7828588
## glmnet 0.8155238 1.0000000 0.9052348
## glmnet.1 0.7828588 0.9052348 1.0000000
## $RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## xgbTree 0.06330538 0.06548392 0.06717139 0.06817997 0.06920201 0.07741116
## glmnet 0.07242736 0.07840845 0.07941515 0.07962689 0.08113677 0.08645527
## glmnet.1 0.06897974 0.07354095 0.07741870 0.07866448 0.08477519 0.08762063
## NA's
## xgbTree 0
## glmnet 0
## glmnet.1 0
##
## $Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## xgbTree 0.3809804 0.4155011 0.4727092 0.4511680 0.4836834 0.5061176 0
## glmnet 0.1665313 0.2340235 0.2740467 0.2715755 0.3244529 0.3434975 0
## glmnet.1 0.1229701 0.2110392 0.3034446 0.2960307 0.3969368 0.4298480 0
# Linear greedy ensemble of the fitted models, weighted to minimise CV RMSE.
set.seed(333)
greedyEnsemble <- caretEnsemble(
  modelList,
  metric = "RMSE",
  trControl = trainControl(method = "cv", number = 7)
)
summary(greedyEnsemble)
## The following models were ensembled: xgbTree, glmnet, glmnet.1
## They were weighted:
## -0.3852 0.8189 0.0774 0.1639
## The resulting RMSE is: 0.0681
## The fit for each individual model on the RMSE is:
## method RMSE RMSESD
## xgbTree 0.06817997 0.004673665
## glmnet 0.07962689 0.004252056
## glmnet.1 0.07866448 0.007164357
# modelList$xgbTree
# hyperparameter tuning results
# Training predictors only (target removed, training rows selected).
X <- select(tr_te, -c(target))[tri, ]
# Parameters mirror the tuned caret xgbTree model above.
# NOTE: "reg:linear" is a deprecated alias of "reg:squarederror" in recent xgboost.
param_dart <- list(objective = "reg:linear", # For regression
                   nrounds = 400,
                   max_depth = 6,
                   eta = 0.01,
                   gamma = 0.00,
                   colsample_bytree = 1,
                   min_child_weight = 4,
                   subsample = 1)
# Fix: the argument is `params`, not `xgb_param` — the original call silently
# dropped the list into `...`, so the tuned parameters were never applied.
mod <- xgboost::xgboost(data = as.matrix(X),
                        label = as.matrix(tr$target),
                        params = param_dart, nrounds = param_dart$nrounds,
                        verbose = FALSE, nthread = parallel::detectCores() - 2,
                        early_stopping_rounds = 8)
# SHAP decomposition of the fitted booster on the training matrix X.
shap_values <- shap.values(xgb_model = mod, X_train = X)
# Long-format SHAP data used by the summary/dependence plots below.
shap_long <- shap.prep(xgb_model = mod, X_train = X)## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
# Horizontal bar chart of per-feature gain from the xgboost importance
# matrix, sorted so the most important feature sits on top.
imp_matrix %>%
  ggplot(aes(x = reorder(Feature, Gain, FUN = max), y = Gain, fill = Feature)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "Features", y = "Importance")
## [1] "temp" "sol_filt_pct" "conduct" "sol_recep_pct"
## [5] "Na_gpl" "sulfato_mg_l" "tft_visc" "cot"
## [9] "x10_pct" "hum_pct" "ca_ppm" "tft_floc"
## [13] "insol" "cloruro_mg_l" "esf_corte" "tft_ca_pct"
## [17] "ph" "Cu_S_gpl"
# Fix: as.tibble() is deprecated (it is what triggers the session warning in
# the log); as_tibble() is the supported constructor.
imp_matrix <- as_tibble(data.frame(Feature = row.names(imp_matrix), Importance = imp_matrix))
imp_matrix
## # A tibble: 18 x 2
## Feature Importance
## <fct> <dbl>
## 1 temp 0.0290
## 2 sol_filt_pct 0.0157
## 3 conduct 0.0145
## 4 sol_recep_pct 0.0126
## 5 Na_gpl 0.0103
## 6 sulfato_mg_l 0.00863
## 7 tft_visc 0.00834
## 8 cot 0.00773
## 9 x10_pct 0.00703
## 10 hum_pct 0.00588
## 11 ca_ppm 0.00578
## 12 tft_floc 0.00515
## 13 insol 0.00509
## 14 cloruro_mg_l 0.00419
## 15 esf_corte 0.00339
## 16 tft_ca_pct 0.00273
## 17 ph 0.00246
## 18 Cu_S_gpl 0.00241
# Same importance data as above, with Spanish axis labels for the report.
imp_matrix %>%
  ggplot(aes(x = reorder(Feature, Importance, FUN = max), y = Importance,
             fill = Feature)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "Variables", y = "Importancia relativa")
## temp sol_filt_pct conduct sol_recep_pct Na_gpl
## 0.028976397 0.015692323 0.014545557 0.012552678 0.010283734
## sulfato_mg_l tft_visc cot x10_pct hum_pct
## 0.008626686 0.008341392 0.007728433 0.007027991 0.005878100
## ca_ppm tft_floc insol cloruro_mg_l esf_corte
## 0.005777914 0.005153675 0.005088682 0.004191773 0.003394108
## tft_ca_pct ph Cu_S_gpl
## 0.002729461 0.002461570 0.002412079
The following plot shows the most important features, and their impact on the filtering rate.
A SHAP force plot shows the contribution of the most important features divided by quantity.
## The SHAP values of the Rest 14 features were summed into variable 'rest_variables'.
## Data has N = 466 | zoom in length is 50 at location 250.
The distribution of the real values against the contribution that they have on the filtering rate.
# Dependence plots for the four features with the largest mean |SHAP| value.
top4 <- names(shap_values$mean_shap_score)[1:4]
fig_list <- lapply(top4, shap.plot.dependence, data_long = shap_long)
gridExtra::grid.arrange(grobs = fig_list, ncol = 2)
# SHAP interaction values for the training matrix.
# NOTE(review): recent SHAPforxgboost releases spell this argument
# `xgb_model`; confirm the installed version accepts `xgb_mod`.
shap_int <- shap.prep.interaction(xgb_mod = mod, X_train = as.matrix(X))
# Interaction-coloured dependence plots for two feature pairs.
g3 <- shap.plot.dependence(data_long = shap_long,
                           data_int = shap_int,
                           x = "temp", y = "conduct",
                           color_feature = "conduct")
g4 <- shap.plot.dependence(data_long = shap_long,
                           data_int = shap_int,
                           x = "conduct", y = "sol_filt_pct",
                           color_feature = "sol_filt_pct")
gridExtra::grid.arrange(g3, g4, ncol = 2)